In [1]:
import pandas as pd
import os

os.chdir(r'C:/Users/HP/Desktop')

df = pd.read_csv("Travel details dataset.csv")
In [2]:
df.head()
Out[2]:
Trip ID Destination Start date End date Duration (days) Traveler name Traveler age Traveler gender Traveler nationality Accommodation type Accommodation cost Transportation type Transportation cost
0 1 London, UK 5/1/2023 5/8/2023 7.0 John Smith 35.0 Male American Hotel 1200 Flight 600
1 2 Phuket, Thailand 6/15/2023 6/20/2023 5.0 Jane Doe 28.0 Female Canadian Resort 800 Flight 500
2 3 Bali, Indonesia 7/1/2023 7/8/2023 7.0 David Lee 45.0 Male Korean Villa 1000 Flight 700
3 4 New York, USA 8/15/2023 8/29/2023 14.0 Sarah Johnson 29.0 Female British Hotel 2000 Flight 1000
4 5 Tokyo, Japan 9/10/2023 9/17/2023 7.0 Kim Nguyen 26.0 Female Vietnamese Airbnb 700 Train 200
In [3]:
df.tail()
Out[3]:
Trip ID Destination Start date End date Duration (days) Traveler name Traveler age Traveler gender Traveler nationality Accommodation type Accommodation cost Transportation type Transportation cost
134 135 Rio de Janeiro, Brazil 8/1/2023 8/10/2023 9.0 Jose Perez 37.0 Male Brazilian Hostel 2500 Car 2000
135 136 Vancouver, Canada 8/15/2023 8/21/2023 6.0 Emma Wilson 29.0 Female Canadian Hotel 5000 Airplane 3000
136 137 Bangkok, Thailand 9/1/2023 9/8/2023 7.0 Ryan Chen 34.0 Male Chinese Hostel 2000 Train 1000
137 138 Barcelona, Spain 9/15/2023 9/22/2023 7.0 Sofia Rodriguez 25.0 Female Spanish Airbnb 6000 Airplane 2500
138 139 Auckland, New Zealand 10/1/2023 10/8/2023 7.0 William Brown 39.0 Male New Zealander Hotel 7000 Train 2500
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139 entries, 0 to 138
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Trip ID               139 non-null    int64  
 1   Destination           137 non-null    object 
 2   Start date            137 non-null    object 
 3   End date              137 non-null    object 
 4   Duration (days)       137 non-null    float64
 5   Traveler name         137 non-null    object 
 6   Traveler age          137 non-null    float64
 7   Traveler gender       137 non-null    object 
 8   Traveler nationality  137 non-null    object 
 9   Accommodation type    137 non-null    object 
 10  Accommodation cost    137 non-null    object 
 11  Transportation type   136 non-null    object 
 12  Transportation cost   136 non-null    object 
dtypes: float64(2), int64(1), object(10)
memory usage: 14.2+ KB
In [5]:
df.describe()
Out[5]:
Trip ID Duration (days) Traveler age
count 139.000000 137.000000 137.000000
mean 70.000000 7.605839 33.175182
std 40.269923 1.601276 7.145441
min 1.000000 5.000000 20.000000
25% 35.500000 7.000000 28.000000
50% 70.000000 7.000000 31.000000
75% 104.500000 8.000000 38.000000
max 139.000000 14.000000 60.000000
In [7]:
df.shape
Out[7]:
(139, 13)
In [8]:
print(df.columns)
Index(['Trip ID', 'Destination', 'Start date', 'End date', 'Duration (days)',
       'Traveler name', 'Traveler age', 'Traveler gender',
       'Traveler nationality', 'Accommodation type', 'Accommodation cost',
       'Transportation type', 'Transportation cost'],
      dtype='object')
In [12]:
# Check for missing values
print(df.isnull().sum())

# You can either drop rows with missing values or fill them
df = df.dropna()  # Drop rows with missing values
Trip ID                 0
Destination             2
Start date              2
End date                2
Duration (days)         2
Traveler name           2
Traveler age            2
Traveler gender         2
Traveler nationality    2
Accommodation type      2
Accommodation cost      2
Transportation type     3
Transportation cost     3
dtype: int64
In [33]:
# Convert 'Start date' and 'End date' to datetime format
df['Start date'] = pd.to_datetime(df['Start date'], errors='coerce')
df['End date'] = pd.to_datetime(df['End date'], errors='coerce')

# Calculate trip duration based on dates (Duration in days)
df['Duration (days)'] = (df['End date'] - df['Start date']).dt.days

# Check the new dataset
print(df[['Start date', 'End date', 'Duration (days)']].head())
  Start date   End date  Duration (days)
0 2023-05-01 2023-05-08                7
1 2023-06-15 2023-06-20                5
2 2023-07-01 2023-07-08                7
3 2023-08-15 2023-08-29               14
4 2023-09-10 2023-09-17                7
In [34]:
# Convert 'Start date' and 'End date' to datetime format
df['Start date'] = pd.to_datetime(df['Start date'], errors='coerce')
df['End date'] = pd.to_datetime(df['End date'], errors='coerce')

# Calculate trip duration based on dates (Duration in days)
df['Duration (days)'] = (df['End date'] - df['Start date']).dt.days

# Check the new dataset
print(df[['Start date', 'End date', 'Duration (days)']].head())
  Start date   End date  Duration (days)
0 2023-05-01 2023-05-08                7
1 2023-06-15 2023-06-20                5
2 2023-07-01 2023-07-08                7
3 2023-08-15 2023-08-29               14
4 2023-09-10 2023-09-17                7
In [35]:
# Remove non-numeric characters (like '$', 'USD', etc.) from 'Accommodation cost' and 'Transportation cost'
df['Accommodation cost'] = df['Accommodation cost'].replace(r'[^\d.]', '', regex=True).astype(float)
df['Transportation cost'] = df['Transportation cost'].replace(r'[^\d.]', '', regex=True).astype(float)

# Fill any missing values with the median (if applicable)
df['Accommodation cost'].fillna(df['Accommodation cost'].median(), inplace=True)
df['Transportation cost'].fillna(df['Transportation cost'].median(), inplace=True)

# Check the cleaned columns
print(df[['Accommodation cost', 'Transportation cost']].head())
   Accommodation cost  Transportation cost
0              1200.0                600.0
1               800.0                500.0
2              1000.0                700.0
3              2000.0               1000.0
4               700.0                200.0
In [17]:
df.describe().T
Out[17]:
count mean std min 25% 50% 75% max
Trip ID 136.0 69.463235 40.388284 1.0 34.75 68.5 104.25 139.0
Duration (days) 136.0 7.536765 1.558290 5.0 7.00 7.0 8.00 14.0
Traveler age 136.0 33.117647 7.139933 20.0 28.00 31.0 37.25 60.0
Accommodation cost 136.0 1253.529412 1338.644162 150.0 600.00 900.0 1200.00 8000.0
Transportation cost 136.0 645.183824 584.476153 20.0 200.00 550.0 800.00 3000.0
In [19]:
df = df.dropna()
df = df.reset_index()
In [37]:
col_list = ["Transportation cost", "Accommodation cost"]

for col in col_list:
    df[col] = df[col].astype(str)
    df[col] = df[col].str.replace("$", "")
    df[col] = df[col].str.replace(" USD", "")
    df[col] = df[col].str.replace(",", "")
    df[col] = pd.to_numeric(df[col])
C:\Users\HP\AppData\Local\Temp\ipykernel_2396\1584975083.py:5: FutureWarning:

The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.

In [22]:
df["Transportation type"] = df["Transportation type"].astype(str)
In [24]:
group_accommodation_type = df.groupby("Accommodation type").agg({"Trip ID":"count", "Duration (days)":"mean", "Traveler age":"mean",
                                        "Accommodation cost" : "mean", "Transportation type" : "max", 
                                        "Transportation cost" : "mean"}).reset_index()
In [25]:
group_nationality = df.groupby("Traveler nationality").agg({"Trip ID":"count", "Duration (days)":"mean", "Traveler age":"mean",
                                        "Accommodation cost" : "mean", "Transportation type" : "max", 
                                        "Transportation cost" : "mean"}).reset_index()
In [26]:
group_transportation_type = df.groupby("Transportation type").agg({"Trip ID":"count", "Duration (days)":"mean", "Traveler age":"mean",
                                                                   "Accommodation cost" : "mean","Transportation cost" : "mean"}).reset_index()
In [28]:
group_destination_nationality = df.groupby(["Destination", "Traveler nationality"]).agg({"Accommodation cost":"mean", "Trip ID": "count"}).reset_index()
group_destination_nationality = group_destination_nationality.sort_values("Accommodation cost", ascending = False)
In [30]:
import plotly.express as px
import plotly.graph_objs as go
import warnings
In [31]:
group_accommodation_type = group_accommodation_type.sort_values(by='Accommodation cost')

sorted_accommodation_types = group_accommodation_type['Accommodation type'].unique()

fig = px.bar(
    group_accommodation_type,
    y='Accommodation type',
    x='Accommodation cost',
    color='Accommodation cost',
    orientation='h',
    color_continuous_scale='blugrn'  
)


fig.update_layout(
title_x=0.5, 
    title_font_size=20,  
    legend=dict(
        font=dict(size=12),  
        bgcolor='rgba(255, 255, 255, 0.5)',  
        bordercolor='rgba(0, 0, 0, 0.5)',  
        borderwidth=2  
    ),
    margin=dict(
        l=50,
        r=50,
        b=50,
        t=80
    ))


fig.show()
02004006008001000120014001600GuesthouseRiadHostelVacation rentalAirbnbVillaHotelResort
400600800100012001400Accommodation costAccommodation costAccommodation type
plotly-logomark
In [38]:
import pandas as pd
import matplotlib.pyplot as plt

# take only head(5) 
data = {
    'Trip ID': [134, 135, 136, 137, 138],
    'Destination': ['Rio de Janeiro, Brazil', 'Vancouver, Canada', 'Bangkok, Thailand', 'Barcelona, Spain', 'Auckland, New Zealand'],
    'Start date': ['8/1/2023', '8/15/2023', '9/1/2023', '9/15/2023', '10/1/2023'],
    'End date': ['8/10/2023', '8/21/2023', '9/8/2023', '9/22/2023', '10/8/2023'],
    'Duration (days)': [9, 6, 7, 7, 7],
    'Traveler name': ['Jose Perez', 'Emma Wilson', 'Ryan Chen', 'Sofia Rodriguez', 'William Brown'],
    'Traveler age': [37, 29, 34, 25, 39],
    'Traveler gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Traveler nationality': ['Brazilian', 'Canadian', 'Chinese', 'Spanish', 'New Zealander'],
    'Accommodation type': ['Hostel', 'Hotel', 'Hostel', 'Airbnb', 'Hotel'],
    'Accommodation cost': [2500, 5000, 2000, 6000, 7000],
    'Transportation type': ['Car', 'Airplane', 'Train', 'Airplane', 'Train'],
    'Transportation cost': [2000, 3000, 1000, 2500, 2500]
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate average transportation cost by transportation type
avg_transport_cost = df.groupby('Transportation type')['Transportation cost'].mean()

# Plotting
plt.figure(figsize=(10, 6))
avg_transport_cost.plot(kind='bar')
plt.title('Average Transportation Cost by Transportation Type')
plt.xlabel('Transportation Type')
plt.ylabel('Average Transportation Cost')
plt.xticks(rotation=45)
plt.show()
In [39]:
import pandas as pd
import matplotlib.pyplot as plt

# Sample data
data = {
    'Trip ID': [134, 135, 136, 137, 138],
    'Destination': ['Rio de Janeiro, Brazil', 'Vancouver, Canada', 'Bangkok, Thailand', 'Barcelona, Spain', 'Auckland, New Zealand'],
    'Start date': ['8/1/2023', '8/15/2023', '9/1/2023', '9/15/2023', '10/1/2023'],
    'End date': ['8/10/2023', '8/21/2023', '9/8/2023', '9/22/2023', '10/8/2023'],
    'Duration (days)': [9, 6, 7, 7, 7],
    'Traveler name': ['Jose Perez', 'Emma Wilson', 'Ryan Chen', 'Sofia Rodriguez', 'William Brown'],
    'Traveler age': [37, 29, 34, 25, 39],
    'Traveler gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Traveler nationality': ['Brazilian', 'Canadian', 'Chinese', 'Spanish', 'New Zealander'],
    'Accommodation type': ['Hostel', 'Hotel', 'Hostel', 'Airbnb', 'Hotel'],
    'Accommodation cost': [2500, 5000, 2000, 6000, 7000],
    'Transportation type': ['Car', 'Airplane', 'Train', 'Airplane', 'Train'],
    'Transportation cost': [2000, 3000, 1000, 2500, 2500]
}

# Create DataFrame
df = pd.DataFrame(data)

# Plotting the relationship between Duration (days) and Accommodation cost
plt.figure(figsize=(10, 6))
plt.scatter(df['Duration (days)'], df['Accommodation cost'])
plt.title('Relationship between Duration of Stay and Accommodation Cost')
plt.xlabel('Duration (days)')
plt.ylabel('Accommodation Cost')
plt.grid(True)
plt.show()
In [41]:
import pandas as pd
import matplotlib.pyplot as plt

# Sample data
data = {
    'Trip ID': [134, 135, 136, 137, 138],
    'Destination': ['Rio de Janeiro, Brazil', 'Vancouver, Canada', 'Bangkok, Thailand', 'Barcelona, Spain', 'Auckland, New Zealand'],
    'Start date': ['8/1/2023', '8/15/2023', '9/1/2023', '9/15/2023', '10/1/2023'],
    'End date': ['8/10/2023', '8/21/2023', '9/8/2023', '9/22/2023', '10/8/2023'],
    'Duration (days)': [9, 6, 7, 7, 7],
    
    'Traveler nationality': ['Brazilian', 'Canadian', 'Chinese', 'Spanish', 'New Zealander'],
    'Accommodation type': ['Hostel', 'Hotel', 'Hostel', 'Airbnb', 'Hotel'],
    'Accommodation cost': [2500, 5000, 2000, 6000, 7000],
    'Transportation type': ['Car', 'Airplane', 'Train', 'Airplane', 'Train'],
    'Transportation cost': [2000, 3000, 1000, 2500, 2500]
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate average duration by transportation type
avg_duration = df.groupby('Transportation type')['Duration (days)'].mean()

# Plotting
plt.figure(figsize=(10, 6))
avg_duration.plot(kind='bar', color=['skyblue', 'coral', 'limegreen'])
plt.title('Average Duration of Stay by Transportation Type')
plt.xlabel('Transportation Type')
plt.ylabel('Average Duration of Stay (days)')
plt.xticks(rotation=45)
plt.show()
In [43]:
# Calculate total travel cost
df['Total travel cost'] = df['Accommodation cost'] + df['Transportation cost'] 
print(df[['Trip ID', 'Destination', 'Total travel cost']])
   Trip ID             Destination  Total travel cost
0      134  Rio de Janeiro, Brazil               4500
1      135       Vancouver, Canada               8000
2      136       Bangkok, Thailand               3000
3      137        Barcelona, Spain               8500
4      138   Auckland, New Zealand               9500
In [45]:
# Sample only for first 5 people 
plt.figure(figsize=(12, 6))
plt.bar(df['Destination'], df['Total travel cost'], color='skyblue') 
plt.title('Total Travel Cost by Destination')
plt.xlabel('Destination')
plt.ylabel('Total Travel Cost')
plt.xticks(rotation=45, ha='right') 
plt.tight_layout() 
plt.show()
In [48]:
plt.figure(figsize=(12, 6)) 
bar_width = 0.4
positions = range(len(df['Destination']))
plt.bar(positions, df['Accommodation cost'], bar_width, label='Accommodation Cost', color='skyblue') 
plt.bar(positions, df['Transportation cost'], bar_width, bottom=df['Accommodation cost'], label='Transportation Cost', color='coral')
plt.title('Accommodation and Transportation Costs by Destination') 

plt.xlabel('Destination')
plt.ylabel('Cost')
plt.xticks(positions, df['Destination'], rotation=45, ha='right')
plt.legend() 
plt.tight_layout()
plt.show()
In [49]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms of 'Accommodation cost' and 'Transportation cost'
df[['Accommodation cost', 'Transportation cost']].hist(bins=20, figsize=(10, 5))
plt.tight_layout()
plt.show()

# Boxplot to check for outliers
sns.boxplot(data=df[['Accommodation cost', 'Transportation cost']])
plt.show()
In [ ]: